syntax = "proto3";

package tensorflow.profiler;

import "google/protobuf/any.proto";
import "tensorflow/core/profiler/protobuf/op_metrics.proto";

// Breakdown of step-time on generic hardware. Note that these components are
// mutually exclusive, so adding them together equals the step time. If an
// execution time interval has multiple types of events happening, we need to
// pick one of the event types to attribute the time interval to.
message GenericStepBreakdown {
  // Map from event type to the accumulated duration, in picoseconds, of that
  // type. The key is an integer event-type code; the enum it corresponds to
  // is not defined in this file.
  map<int32, uint64> type_ps = 1;

  // Map from string category to the accumulated duration, in picoseconds,
  // for that category.
  map<string, uint64> category_ps = 2;
}

// Information about memory transfer to/from device memory.
message DeviceMemoryTransfer {
  // Number of occurrences (presumably the count of transfers aggregated into
  // this entry -- confirm against the producer).
  uint64 occurrence = 1;
  // Time in microseconds, per the `_us` suffix (presumably the accumulated
  // transfer time -- confirm against the producer).
  double time_us = 2;
  // Number of bytes transferred.
  uint64 bytes_transferred = 3;
}

// Next ID: 7
// Result proto for StepInfo.
message StepInfoResult {
  // The step number.
  uint32 step_num = 1;
  // The step name.
  string step_name = 5;
  // The step duration in picoseconds.
  uint64 duration_ps = 2;
  // The start time of this step in picoseconds.
  uint64 begin_ps = 3;
  // Breakdown of the step-time. Can be unpacked into a GenericStepBreakdown.
  google.protobuf.Any step_breakdown = 4;
  // Total time/bytes/occurrences for collectives (All-Reduce, All-to-All,
  // etc.).
  DeviceMemoryTransfer collectives = 6;
}

// Result proto for all-reduce ops.
message AllReduceInfo {
  // Unique id for all-reduce ops.
  uint64 id = 1;
  // The name of the hlo op. This field is no longer set by the profiler.
  string name = 2 [deprecated = true];
  // All-reduce nodes from different modules that share the same
  // all_reduce_id are all-reduced together. If empty (unset, i.e. 0 under
  // proto3 defaults), AllReduce will not be applied across modules.
  uint64 all_reduce_id = 3;
  // The start time in picoseconds of the op event.
  uint64 start_time_ps = 4;
  // The end time in picoseconds of the op event.
  uint64 end_time_ps = 5;
  // The size of the op in bytes.
  uint64 byte_size = 6;
}

// Result database for all-reduce ops.
message AllReduceDbResult {
  // The all-reduce op records held in this database, one AllReduceInfo per
  // entry.
  repeated AllReduceInfo all_reduce_info = 1;
}

// Result proto for information in a step across all cores.
message PerCoreStepInfo {
  // The step number.
  uint32 step_num = 1;
  // A map from core_id to StepInfo.
  map<uint32, StepInfoResult> step_info_per_core = 2;
  // The result for the per-step HLO-metric database.
  OpMetricsDb hlo_metrics_db = 3;
  // A map from core ID to program replica id. Replica id map could change
  // during a profile session, but should stay stable within a step.
  map<uint32, uint32> core_id_to_replica_id_map = 5;
  // A map from core_id to all-reduce ops.
  map<uint32, AllReduceDbResult> all_reduce_db_per_core = 6;
  // Information about device memory transfers, categorized by source and
  // destination. Ordered by the following categories:
  // 1. HostToDevice
  // 2. DeviceToHost
  // 3. DeviceToDevice
  // Cores normally share host interfaces (i.e. PCIe).
  repeated DeviceMemoryTransfer device_memory_transfers = 7;

  // Field number 4 is reserved and must not be reused (presumably it belonged
  // to a field that has since been removed).
  reserved 4;
}

// Result proto for a StepDatabase.
message StepDatabaseResult {
  // A sequence of PerCoreStepInfo.
  repeated PerCoreStepInfo step_sequence = 1;
  // Whether the step db uses incomplete step information.
  // This flag is set to true when:
  // 1) no step marker or annotation is present.
  // 2) the profiling duration is too short to cover a full step.
  // If this flag is false, we will group and break down the
  // profile by complete steps only and ignore incomplete steps.
  // If this flag is true, we will simply aggregate and break down over the
  // total profile as a single step.
  bool use_incomplete_step = 2;
  // Number of steps dropped during post processing.
  uint32 num_steps_dropped = 3;
  // Disambiguates why step_sequence is empty:
  //   * there is no step profiled on any host, then empty_intersect is false.
  //   * there are steps profiled on some host, but the intersection of steps
  //     over all hosts is empty, then empty_intersect is true.
  bool empty_intersect = 4;
}
